#setwd('/afs/inf.ed.ac.uk/user/s17/s1725186/Documents/PhD-InitialExperiments/SFARI_genes/R_Markdowns/')
library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(dendextend)
library(RColorBrewer) ; library(viridis)
library(biomaRt)
library(knitr)
SFARI_colour_hue = function(r) {
pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}
Load raw count expression matrix and pubmed counts by gene
datExpr = read.csv('./../../FirstYearReview/Data/Gandal/RNAseq_ASD_datExpr.csv')
pubmed_counts = read.csv('./../Data/pubmed_count_by_gene.csv')
Convert ensembl IDs to gene names
# Get gene names from Ensembl IDs
getinfo = c('ensembl_gene_id','external_gene_id','gene_biotype')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl',
host='feb2014.archive.ensembl.org')
datGenes = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=datExpr$X, mart=mart)
datGenes = datGenes[match(datExpr$X, datGenes$ensembl_gene_id),]
rm(getinfo, mart)
datGenes = datGenes %>% mutate('meanExpr' = rowMeans(datExpr[,-1])) %>%
left_join(pubmed_counts, by=c('external_gene_id'='Gene')) %>% na.omit
*Perhaps later I should remove the entries with gene name 5s_rRNA
Very heavy right tail
ggplotly(datGenes %>% ggplot(aes(meanExpr+1)) + geom_density(color='#0099cc', fill='#0099cc', alpha=0.3) +
scale_x_log10() + theme_minimal())
Very heavy right tail too
summary(datGenes$Count)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 723 10 7711933
datGenes %>% arrange(desc(Count)) %>% distinct(external_gene_id, .keep_all=TRUE) %>%
head(20) %>% kable(caption='Top 20 genes by number of publications')
| ensembl_gene_id | external_gene_id | gene_biotype | meanExpr | Count |
|---|---|---|---|---|
| ENSG00000180176 | TH | protein_coding | 5.4090909 | 7711933 |
| ENSG00000177606 | JUN | protein_coding | 1970.3750000 | 2604914 |
| ENSG00000136999 | NOV | protein_coding | 348.3522727 | 2332690 |
| ENSG00000228491 | MICE | pseudogene | 0.0000000 | 1634646 |
| ENSG00000173599 | PC | protein_coding | 803.6136364 | 1464096 |
| ENSG00000133424 | LARGE | protein_coding | 1544.2727273 | 1393404 |
| ENSG00000164458 | T | protein_coding | 0.3522727 | 1215682 |
| ENSG00000154059 | IMPACT | protein_coding | 292.6931818 | 910176 |
| ENSG00000087085 | ACHE | protein_coding | 234.3409091 | 818059 |
| ENSG00000119335 | SET | protein_coding | 1844.2500000 | 502402 |
| ENSG00000062485 | CS | protein_coding | 973.9204545 | 407959 |
| ENSG00000087111 | PIGS | protein_coding | 375.0795455 | 374551 |
| ENSG00000115718 | PROC | protein_coding | 3.9431818 | 359016 |
| ENSG00000168453 | HR | protein_coding | 701.6818182 | 256256 |
| ENSG00000175084 | DES | protein_coding | 31.4772727 | 217417 |
| ENSG00000105976 | MET | protein_coding | 780.5568182 | 186947 |
| ENSG00000188158 | NHS | protein_coding | 572.5681818 | 169222 |
| ENSG00000204490 | TNF | protein_coding | 0.0000000 | 167834 |
| ENSG00000169083 | AR | protein_coding | 163.5568182 | 166496 |
| ENSG00000010610 | CD4 | protein_coding | 204.3068182 | 164440 |
ggplotly(datGenes %>% ggplot(aes(Count+1)) + geom_density(color='#0099cc', fill='#0099cc', alpha=0.3) +
scale_x_log10() + theme_minimal())
There is a positive relation between these two variables and the correlation is quite high but only when transforming to logarithmic scale the variables
datGenes %>% ggplot(aes(meanExpr+1, Count+1)) + geom_point(color='#0099cc', alpha=0.1) +
geom_smooth(color='gray', alpha=0.3) + scale_x_log10() + scale_y_log10() +
ggtitle(paste0('Cor=(' ,cor(log10(datGenes$meanExpr+1), log10(datGenes$Count+1)),')')) + theme_minimal()
Differentiating protein coding genes from the rest, the correlation decreases when separating the genes (I would have expected the opposite)
*The red stripes on the plot correspond to non coding genes that have many ensembl IDs, like Y_RNA, snoU13, U3, or SNORD112
datGenes %>% mutate(protein_coding = gene_biotype=='protein_coding') %>%
ggplot(aes(meanExpr+1, Count+1, color=protein_coding)) +
geom_point(alpha=0.1) + geom_smooth() +
scale_x_log10() + scale_y_log10() +
ggtitle(paste0('CorPC=(',
cor(log10(datGenes$meanExpr+1)[datGenes$gene_biotype=='protein_coding'],
log10(datGenes$Count+1)[datGenes$gene_biotype=='protein_coding']),'), ',
'CorNPC=(',
cor(log10(datGenes$meanExpr+1)[datGenes$gene_biotype!='protein_coding'],
log10(datGenes$Count+1)[datGenes$gene_biotype!='protein_coding']),')')) +
theme_minimal()